home *** CD-ROM | disk | FTP | other *** search
- {$A+,B-,D-,E-,F-,G-,I-,L-,N-,O-,R-,S-,V+,X+}
- {$M 6144,8192,655360}
- Program Words;
- { WORDS - A word extracter program. Copyright 1990,91 by Edwin T. Floyd. }
- Uses Dos, Crt, Token, PairHeap;
-
- Const
- WordChar = ['a'..'z','A'..'Z']; { Default WordSet }
- DefaultOutput = ''; { Default output filename (''=stdout) }
- BufSize = 4096; { I/O buffer size }
-
- Type
- SetOpType = (Union, Intersection, Complement);
- SetOfChar = Set Of Char;
- SortEntryType = Object(HeapEntry)
- { Data structure used for sorting }
- Token : Word;
- End;
- SortHeapType = Object(Heap)
- { PairHeap compare function override }
- Function Less(Var x, y : HeapEntry) : Boolean; Virtual;
- End;
- FileEntryPtr = ^FileEntry;
- FileEntry = Record
- { Input file name list entry }
- NextFile : FileEntryPtr;
- FileName : PathStr;
- End;
-
- Const
- FileList : FileEntryPtr = Nil; { File name list (head) }
- LastFile : FileEntryPtr = Nil; { File name list (tail) }
- HashTab : PToken = Nil; { Hash table pointer }
- TestTab : PToken = Nil; { Test hash table pointer }
- WordCount : LongInt = 0; { Total number of words examined }
- ReturnCode : Word = 0; { Return code for Halt }
- WordSet : SetOfChar = WordChar; { Words are made of these }
- SetOp : SetOpType = Union; { Set operation }
- Alphabetize : Boolean = False; { If true, sort output words }
- LowerCase : Boolean = False; { If true, case is significant }
- HighOrder : Boolean = False; { If true, clear high-order bits }
- SuppressOutput : Boolean = False; { If true, do not write output file }
- OutOfMemory : Boolean = False; { Set true by HandleHeapError }
- Aborted : Boolean = False; { True if operator aborted }
- OutName : PathStr = DefaultOutput; { Output file name }
-
- Var
- OldMem : LongInt; { Original value of MemAvail }
- SortHeap : SortHeapType; { Sorter object }
- TextFile : File; { Input/Output file }
- TextBuf : Array[1..BufSize] Of Char; { I/O buffer for TextFile }
-
- {$S+}
-
- Function ProcessParameter(s : String) : Boolean; Forward;
-
- Function ParseParamString(s : String) : Boolean;
- { Extract parameters from a string and process them; return True if all OK. }
- Var
- i, j : Word;
- ParamsOk : Boolean;
- Begin
- ParamsOk := True;
- While (s <> '') And (s[Length(s)] = ' ') Do Dec(s[0]);
- While s <> '' Do Begin
- i := 1;
- While (i <= Length(s)) And (s[i] = ' ') Do Inc(i);
- j := Succ(i);
- While (j <= Length(s)) And (s[j] <> ' ') Do Inc(j);
- If Not ProcessParameter(Copy(s, i, j - i)) Then ParamsOk := False;
- Delete(s, 1, Pred(j));
- End;
- ParseParamString := ParamsOk;
- End;
-
- Function ProcessParameter(s : String) : Boolean;
- { Process command line parameter or file name; return True if OK. }
- Var
- ThisFile : FileEntryPtr;
- IncludeFile : Text;
- ParamOk : Boolean;
- i, j : Word;
- IoRes : Integer;
-
- Procedure GetFiles(Var s : String);
- Var
- Path : PathStr;
- Dir : DirStr;
- Name : NameStr;
- Ext : ExtStr;
- Search : SearchRec;
- Begin
- Path := FExpand(s);
- FSplit(Path, Dir, Name, Ext);
- FindFirst(Path, Archive, Search);
- If DosError <> 0 Then Begin
- WriteLn('No files match ', s);
- ParamOk := False;
- End;
- While DosError = 0 Do Begin
- Path := Dir + Search.Name;
- ThisFile := FileList;
- While (ThisFile <> Nil) And (ThisFile^.FileName <> Path) Do
- ThisFile := ThisFile^.NextFile;
- If ThisFile = Nil Then Begin
- New(ThisFile);
- If ThisFile <> Nil Then Begin
- With ThisFile^ Do Begin
- NextFile := Nil;
- FileName := Path;
- End;
- If LastFile = Nil Then FileList := ThisFile
- Else LastFile^.NextFile := ThisFile;
- LastFile := ThisFile;
- End;
- End Else WriteLn('Already in list: ', Path);
- FindNext(Search);
- End;
- End;
-
- Begin
- ParamOk := True;
- If (s[1] = '-') Or (s[1] = '/') Then Case UpCase(s[2]) Of
- 'U' : SetOp := Union;
- 'I' : SetOp := Intersection;
- 'C' : SetOp := Complement;
- 'A' : If s[3] = '-' Then Alphabetize := False Else Alphabetize := True;
- 'L' : If s[3] = '-' Then LowerCase := False Else LowerCase := True;
- 'H' : If s[3] = '-' Then HighOrder := False Else HighOrder := True;
- 'O' : Begin
- Delete(s, 1, 2);
- For i := 1 To Length(s) Do s[i] := UpCase(s[i]);
- If (s <> '') And ((s[1] = '-') Or (s = 'NUL')) Then Begin
- SuppressOutput := True;
- OutName := '-';
- End Else Begin
- SuppressOutput := False;
- If s = '' Then OutName := s Else OutName := FExpand(s);
- End;
- End;
- 'W' : Begin
- Delete(s, 1, 2);
- Case s[1] Of
- '+' : ;
- '-' : WordSet := [];
- Else Begin
- WriteLn('WordSet (-W) option must be followed by + or -.');
- ParamOk := False;
- End;
- End;
- Delete(s, 1, 1);
- For i := 1 To Length(s) Do
- WordSet := WordSet + [s[i]];
- End;
- Else Begin
- WriteLn('Unrecognized option: ', s);
- ParamOk := False;
- End;
- End Else If s[1] = '@' Then Begin
- Delete(s, 1, 1);
- For i := 1 To Length(s) Do s[i] := UpCase(s[i]);
- Assign(IncludeFile, s);
- Reset(IncludeFile);
- IoRes := IoResult;
- If IoRes = 0 Then Begin
- WriteLn('Processing include file ', s);
- Repeat
- ReadLn(IncludeFile, s);
- IoRes := IoResult;
- If IoRes = 0 Then If Not ParseParamString(s) Then ParamOk := False;
- Until Eof(IncludeFile) Or (IoRes <> 0);
- If IoRes <> 0 Then Begin
- WriteLn('Error ', IoRes, ' reading include file');
- ParamOk := False;
- End;
- Close(IncludeFile);
- IoRes := IoResult;
- End Else Begin
- WriteLn('Error ', IoRes, ' opening include file ', s);
- ParamOk := False;
- End;
- End Else GetFiles(s);
- ProcessParameter := ParamOk;
- End;
-
- Procedure ParseParams;
- { Interpret environment and command line parameters; display Help info. }
- Var
- i, j : Word;
- ParamsOk : Boolean;
- Ch : Char;
- s : String;
- Begin
- WriteLn('WORDS v1.2 - A word extractor program. Copyright (c) 1990,91 by Edwin T. Floyd.');
- ParamsOk := True;
- If Not ParseParamString(GetEnv('WORDS')) Then Begin
- WriteLn('Error found in SET WORDS=.. environment string');
- ParamsOk := False;
- End;
- For i := 1 To ParamCount Do Begin
- FillChar(s[1], 255, ' ');
- s := ParamStr(i);
- If Not ProcessParameter(s) Then ParamsOk := False;
- End;
- If Not ParamsOk Then Begin
- WriteLn('At least one parameter was in error. Run WORDS with no parameters');
- WriteLn('to see documentation.');
- Halt(1);
- End Else If FileList = Nil Then Begin
- WriteLn;
- WriteLn(' WORDS filenames.. [-U/-I/-C] [-A] [-L] [-H] [-W[+/-]abc..] [-Oname] [@name]' );
- WriteLn;
- WriteLn('All command line parameters are separated by spaces. Input text filenames');
- WriteLn('and options may be intermixed; options are distinguished by a leading hyphen:');
- WriteLn;
- WriteLn(' -U, -I or -C specifies the set operation to be performed on the extracted');
- WriteLn(' words from the files. The operations are:');
- WriteLn(' -U Union: Keep all unique words from any input file (default);');
- WriteLn(' -I Intersection: Keep unique words common to all files;');
- WriteLn(' -C Complement: Keep unique words from second and subsequent files only');
- WriteLn(' if they are not contained in the first file.');
- WriteLn(' -A[-] Sort output words alphabetically (default off).');
- WriteLn(' -H[-] Clear high-order bits on input file (i.e. WordStar, default off).');
- WriteLn(' -L[-] Lower case is significant (default off).');
- WriteLn(' -W-abc.. Replace the word character set with the indicated characters');
- WriteLn(' (default is all alphabetic characters, upper and lower case).');
- WriteLn(' -W+abc.. Add additional characters to the word character set.');
- WriteLn(' -O[name] Name the output file (default is name omitted => stdout).');
- WriteLn(' -O- Suppress output (counts are still displayed on screen).');
- WriteLn;
- WriteLn('The "@" prefixes the name of an ASCII include file which may contain');
- Write('filenames, options, and nested include files, in any order. ');
- Ch := ReadKey;
- WriteLn;
- WriteLn;
- WriteLn('You may use the DOS "SET" command to specify default parameters. Examples:');
- WriteLn;
- WriteLn(' SET WORDS=-U -A+ -L+ -Owords.out -W-ABCDEFGHIJKLMNOPQRSTUVWXYZ');
- WriteLn(' SET WORDS=@defaults.wrd -O');
- WriteLn;
- WriteLn('Command line parameters override "SET" parameters. WORDS examples:');
- WriteLn;
- WriteLn(' WORDS oldwords.lst document.txt -W+-'' -C -Onewwords.lst');
- WriteLn(' WORDS @filename.lst -I -Oallwords.txt');
- WriteLn(' WORDS file1.txt -A+ -U -L- -O | nextprog');
- WriteLn;
- WriteLn('WORDS was written by:');
- WriteLn;
- WriteLn(' Edwin T. Floyd [76067,747] (CompuServe)');
- WriteLn(' #9 Adams Park Court 404/576-3305 (work)');
- WriteLn(' Columbus, GA 31909 404/322-0076 (home)');
- Halt(0);
- End Else Begin
- Case SetOp Of
- Union : s := '-U';
- Intersection : s := '-I';
- Complement : s := '-C';
- End;
- If Alphabetize Then ch := '+' Else ch := '-';
- s := s + ' -A' + ch;
- If LowerCase Then ch := '+' Else ch := '-';
- s := s + ' -L' + ch;
- If HighOrder Then ch := '+' Else ch := '-';
- s := s + ' -H' + ch;
- OldMem := MemAvail;
- WriteLn('Options: ', s, ' -O', OutName, ', ',
- OldMem Shr 10, 'k free.');
- WriteLn('Press <Esc> to stop.');
- End;
- End;
-
- {$S-}
-
- Function SortHeapType.Less(Var x, y : HeapEntry) : Boolean;
- { Sort compare function override }
- Var
- xx : SortEntryType Absolute x;
- yy : SortEntryType Absolute y;
- Begin
- Less := HashTab^.TokenAddress(xx.Token)^ < HashTab^.TokenAddress(yy.Token)^;
- End;
-
- Function ParseInputBlock(Len : Word) : Word;
- { Insert words from input block into hash table }
- Var
- Words : Word;
- t : TokenString;
- i, Toss : Word;
- Begin
- i := 1;
- Words := 0;
- While i <= Len Do Begin
- t := '';
- While (i <= Len) And Not (TextBuf[i] In WordSet) Do Inc(i);
- If i <= Len Then Begin
- While (i <= Len) And (Length(t) < TokenStringSize)
- And (TextBuf[i] In WordSet) Do Begin
- Inc(t[0]);
- If LowerCase Then t[Ord(t[0])] := TextBuf[i]
- Else t[Ord(t[0])] := UpCase(TextBuf[i]);
- Inc(i);
- End;
- Inc(Words);
- Case SetOp Of
- Union : Toss := HashTab^.TokenInsertText(t);
- Intersection : If (TestTab <> Nil) And (TestTab^.TextToken(t) <> 0) Then
- Toss := HashTab^.TokenInsertText(t);
- Complement : If (TestTab <> Nil) And (TestTab^.TextToken(t) = 0) Then
- Toss := HashTab^.TokenInsertText(t);
- End;
- End;
- End;
- ParseInputBlock := Words;
- End;
-
- Procedure ProcessNextFile;
- { Open and process the next input file pointed to by FileList. }
- Var
- ThisFile : FileEntryPtr;
- TempTab : PToken;
- FileWords : LongInt;
- i, MaxLen, Len : Word;
- FileResult : Integer;
- Begin
- ThisFile := FileList;
- With ThisFile^ Do Begin
- Write(FileName, ': ');
- Assign(TextFile, FileName);
- Reset(TextFile, 1);
- FileResult := IoResult;
- If FileResult = 0 Then Begin
- If HashTab = Nil Then New(HashTab, Init);
- Len := 0;
- FileWords := 0;
- Repeat
- BlockRead(TextFile, TextBuf[Succ(Len)], BufSize-Len, i);
- FileResult := IoResult;
- If FileResult = 0 Then Begin
- MaxLen := Len + i;
- If HighOrder Then For i := Succ(Len) To MaxLen Do
- TextBuf[i] := Chr(Ord(TextBuf[i]) And $7F);
- Len := MaxLen;
- If Not Eof(TextFile) Then Begin
- While (Len > 0) And (TextBuf[Len] In WordSet) Do Dec(Len);
- If (Len = 0) Then Len := MaxLen;
- End;
- FileWords := FileWords + ParseInputBlock(Len);
- MaxLen := MaxLen - Len;
- If MaxLen > 0 Then
- Move(TextBuf[Succ(Len)], TextBuf[1], MaxLen);
- Len := MaxLen;
- Write(^M, FileName, ': ', FileWords, ' words, ',
- HashTab^.TokMaxToken, ' kept, ', (OldMem-MemAvail) Shr 10, 'k');
- While KeyPressed Do If ReadKey = ^[ Then Aborted := True;
- End;
- Until Eof(TextFile) Or (FileResult <> 0) Or OutOfMemory Or Aborted;
- Close(TextFile);
- WriteLn(^M, FileName, ': ', FileWords, ' words, ',
- HashTab^.TokMaxToken, ' kept, ', (OldMem-MemAvail) Shr 10, 'k');
- WordCount := WordCount + FileWords;
- End Else WriteLn('Unable to open input file ', FileName);
- If FileResult <> 0 Then Begin
- WriteLn('Error ', FileResult);
- Inc(ReturnCode);
- End;
- FileList := NextFile;
- If SetOp = Intersection Then Begin
- TempTab := TestTab;
- TestTab := HashTab;
- HashTab := TempTab;
- If HashTab <> Nil Then Begin
- Dispose(HashTab, Done);
- HashTab := Nil;
- End;
- End;
- End;
- Dispose(ThisFile);
- End;
-
- Procedure ProcessFirstFile;
- { Process the first input file. }
- Var
- TempTab : PToken;
- Op : SetOpType;
- Begin
- Op := SetOp;
- SetOp := Union;
- ProcessNextFile;
- SetOp := Op;
- If SetOp In [Intersection, Complement] Then Begin
- TempTab := TestTab;
- TestTab := HashTab;
- HashTab := TempTab;
- End;
- End;
-
- Procedure SortWords;
- { Write words to output file, optionally sorted. }
- Var
- SortEntry : ^SortEntryType;
- FileResult : Integer;
- i : Word;
- OutFile : Text;
- Begin
- If SuppressOutput Then WriteLn('Output suppressed') Else Begin
- Assign(OutFile, OutName);
- SetTextBuf(OutFile, TextBuf);
- ReWrite(OutFile);
- FileResult := IoResult;
- If FileResult = 0 Then Begin
- If Alphabetize Then With SortHeap Do Begin
- Init;
- For i := 1 To HashTab^.TokMaxToken Do Begin
- New(SortEntry);
- If SortEntry <> Nil Then Begin
- SortEntry^.Token := i;
- Insert(SortEntry^);
- End;
- End;
- If OutOfMemory Then Begin
- WriteLn('Sort suppressed due to insufficient memory');
- Alphabetize := False;
- Inc(ReturnCode);
- End;
- End;
- If Alphabetize Then With SortHeap Do Begin
- Write('Sorting and writing ', EntryCount, ' words to ');
- If OutName = '' Then Write('<stdout>') Else Write(OutName);
- WriteLn(', ', (OldMem-MemAvail) Shr 10, 'k');
- For i := 1 To EntryCount Do Begin
- SortEntry := DeleteLowEntry;
- If FileResult = 0 Then Begin
- WriteLn(OutFile, HashTab^.TokenAddress(SortEntry^.Token)^);
- FileResult := IoResult;
- End;
- End;
- End Else Begin
- Write('Writing ', HashTab^.TokMaxToken, ' words to ');
- If OutName = '' Then WriteLn('<stdout>') Else WriteLn(OutName);
- For i := 1 To HashTab^.TokMaxToken Do If FileResult = 0 Then Begin
- WriteLn(OutFile, HashTab^.TokenAddress(i)^);
- FileResult := IoResult
- End;
- End;
- If FileResult <> 0 Then Begin
- WriteLn('Error ', FileResult, ' writing file ', OutName);
- Inc(ReturnCode);
- End;
- Close(OutFile);
- FileResult := IoResult;
- If FileResult <> 0 Then Begin
- WriteLn('Error ', FileResult, ' closing file ', OutName);
- Inc(ReturnCode);
- End;
- End Else WriteLn('Error ', FileResult, ' opening file ', OutName);
- End;
- End;
-
- {$F+}
- Function HandleHeapError(Size : Word) : Integer;
- Begin
- If Size > 0 Then Begin
- HandleHeapError := 1;
- OutOfMemory := True;
- End;
- End;
- {$F-}
-
- Begin
- FileMode := $40;
- HeapError := @HandleHeapError;
- OldMem := MemAvail;
- ParseParams;
- ProcessFirstFile;
- While (FileList <> Nil) And Not (OutOfMemory Or Aborted) Do ProcessNextFile;
- If OutOfMemory Then Begin
- WriteLn('Input file processing terminated due to insufficient memory');
- WriteLn('Words collected so far will be written to output file');
- Inc(ReturnCode);
- End;
- If Aborted Then Begin
- WriteLn('File processing aborted by operator');
- SuppressOutput := True;
- Inc(ReturnCode);
- End;
- If SetOp = Intersection Then Begin
- HashTab := TestTab;
- TestTab := Nil;
- End Else If Alphabetize And Not SuppressOutput Then Begin
- WriteLn('Maximizing free memory for sort');
- If TestTab <> Nil Then Dispose(TestTab, Done);
- TestTab := Nil;
- End;
- WriteLn('Final Counts: ', WordCount, ' words examined, ',
- HashTab^.TokMaxToken, ' kept, ', (OldMem-MemAvail) Shr 10, 'k in use');
- OutOfMemory := False;
- SortWords;
- WriteLn('Done!');
- Halt(ReturnCode);
- End.
-